library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggpubr)
library(Rtsne)
library(expss)
library(ClusterR)
library(DESeq2) ; library(biomaRt)
library(knitr)
Load preprocessed dataset (preprocessing code in 01_data_preprocessing.Rmd)
# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
rownames(datExpr) = datGenes$ensembl_gene_id
DE_info = DE_info %>% data.frame
datMeta = datMeta %>% mutate(ID = title)
# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# Update DE_info with Neuronal information
DE_info = DE_info %>% mutate('entrezgene'=rownames(.) %>% as.numeric) %>%
dplyr::rename('padj' = adj.P.Val, 'log2FoldChange' = logFC) %>%
left_join(datGenes %>% dplyr::select(entrezgene, ensembl_gene_id) %>%
dplyr::rename('ID' = ensembl_gene_id), by = 'entrezgene') %>%
left_join(GO_neuronal, by='ID') %>%
mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
mutate(significant=padj<0.05 & !is.na(padj))
rm(GO_annotations)
There seems to be a small group of genes with lower mean expression than most genes
There are some samples with lower level of expression than most samples
plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr))
p1 = plot_data %>% ggplot(aes(Mean)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
xlab('Mean Expression') + ylab('Density') + ggtitle('Mean Expression distribution by Gene') +
theme_minimal()
plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr))
p2 = plot_data %>% ggplot(aes(Mean)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
xlab('Mean Expression') + ylab('Density') +
theme_minimal() + ggtitle('Mean expression distribution by Sample')
grid.arrange(p1, p2, nrow=1)
rm(p1, p2, plot_data)
The differences in level of expression between Phenotype information are not statistically significant
plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr)) %>% left_join(datMeta, by='ID')
p1 = plot_data %>% ggplot(aes(Ethnicity, Mean, fill = Ethnicity)) +
geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) +
stat_compare_means(label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE)) +
xlab('Batch') + ylab('Mean Expression') + theme_minimal() + theme(legend.position = 'none')
p2 = plot_data %>% ggplot(aes(Sex, Mean, fill = Sex)) +
geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) +
stat_compare_means(label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE)) +
xlab('Gender') + ylab('') + theme_minimal() + theme(legend.position = 'none')
grid.arrange(p1,p2, nrow = 1)
rm(p1,p2)
The two groups of genes seem to be partially characterised by genes with Neuronal function
In general, the ASD group has a lower mean than the control group (opposite to Gandal and Gupta’s results)
Only the differences in mean expression between Neuronal and non-neuronal genes are statistically significant
plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr)) %>%
left_join(GO_neuronal, by='ID') %>% mutate('Neuronal'=ifelse(is.na(Neuronal),F,T))
p1 = plot_data %>% ggplot(aes(Mean, color=Neuronal, fill=Neuronal)) + geom_density(alpha=0.3) +
theme_minimal() + theme(legend.position='bottom') +
ggtitle('Mean expression by gene')
p3 = plot_data %>% ggplot(aes(Neuronal, Mean, fill = Neuronal)) +
geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) +
stat_compare_means(label = 'p.signif', method = 't.test',
method.args = list(var.equal = FALSE)) + theme_minimal() +
ylab('Mean Expression') + theme(legend.position = 'none')
plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr)) %>% left_join(datMeta, by='ID')
p2 = plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) + geom_density(alpha=0.3) +
theme_minimal() + theme(legend.position='bottom') +
ggtitle('Mean expression by Sample')
p4 = plot_data %>% ggplot(aes(Diagnosis, Mean, fill = Diagnosis)) +
geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) +
stat_compare_means(label = 'p.signif', method = 't.test',
method.args = list(var.equal = FALSE)) + theme_minimal() +
ylab('Mean Expression') + theme(legend.position = 'none')
grid.arrange(p1, p2, p3, p4, nrow=2)
rm(GO_annotations, plot_data, p1, p2, p3, p4)
In general there doesn’t seem to be a lot of variance in mean expression between autism and control samples by gene
plot_data = data.frame('ID'=rownames(datExpr),
'ASD'=rowMeans(datExpr[,datMeta$Diagnosis=='ASD']),
'CTL'=rowMeans(datExpr[,datMeta$Diagnosis!='ASD'])) %>%
mutate(diff=ASD-CTL, abs_diff = abs(ASD-CTL)) %>%
mutate(std_diff = (diff-mean(diff))/sd(diff), distance = abs((diff-mean(diff))/sd(diff)))
plot_data %>% ggplot(aes(ASD, CTL, color = distance)) + geom_point(alpha = plot_data$abs_diff) +
geom_abline(color = 'gray') + scale_color_viridis(direction = -1) +
ggtitle('Mean expression ASD vs CTL') + theme_minimal() + coord_fixed()
summary(plot_data$std_diff)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -13.3392 -0.2758 0.0368 0.0000 0.3367 11.8097
#cat(paste0('Outlier genes: ', paste(plot_data$ID[abs(plot_data$std_diff)>3], collapse = ', ')))
There are 372 genes with a difference between Diagnoses larger than 3 SD to the distance distribution of all genes. Gene ENSG00000205670 has the largest difference in mean expression between ASD and CTL
There doesn’t seem to be a noticeable difference between mean expression by gene between Diagnosis groups
Samples with autism tend to have lower values than the control group (as we had already seen above)
plot_data = rbind(data.frame('Mean'=rowMeans(datExpr[,datMeta$Diagnosis=='ASD']), 'Diagnosis'='ASD'),
data.frame('Mean'=rowMeans(datExpr[,datMeta$Diagnosis!='ASD']), 'Diagnosis'='CTL')) %>%
mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
p1 = ggplotly(plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) +
geom_density(alpha=0.3) + theme_minimal())
plot_data = rbind(data.frame('Mean'=colMeans(datExpr[,datMeta$Diagnosis=='ASD']), 'Diagnosis'='ASD'),
data.frame('Mean'=colMeans(datExpr[,datMeta$Diagnosis!='ASD']), 'Diagnosis'='CTL')) %>%
mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
p2 = ggplotly(plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) +
geom_density(alpha=0.3) + theme_minimal() +
ggtitle('Mean expression by Gene (left) and by Sample (right) grouped by Diagnosis'))
subplot(p1, p2, nrows=1)
rm(p1, p2, plot_data)
Samples seems to separate relatively well by Diagnosis
ASD samples seem to be more evenly spread out than the Control samples
pca = datExpr %>% t %>% prcomp
plot_data = data.frame('ID'=colnames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
left_join(datMeta, by='ID') %>% dplyr::select('ID','PC1','PC2','Diagnosis') %>%
mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
plot_data %>% ggplot(aes(PC1, PC2, color=Diagnosis)) + geom_point(alpha = 0.8) +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
theme_minimal() + ggtitle('PCA of Samples')
rm(pca, plot_data)
Looks exactly the same as the PCA visualisation, just inverting the both axes
d = datExpr %>% t %>% dist
fit = cmdscale(d, k=2)
plot_data = data.frame('ID'=colnames(datExpr), 'C1'=fit[,1], 'C2'=fit[,2]) %>% left_join(datMeta, by='ID') %>%
dplyr::select('C1','C2','Diagnosis') %>%
mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
plot_data %>% ggplot(aes(C1, C2, color=Diagnosis)) + geom_point(alpha = 0.8) + theme_minimal() + ggtitle('MDS')
rm(d, fit, plot_data)
T-SNE seems to be struggling to separate the samples by Diagnosis
Using a perplexity of 2 the ASD samples seem to gather in the center and Controls in the periphery
perplexities = c(1,2,5,10)
ps = list()
for(i in 1:length(perplexities)){
set.seed(123)
tsne = datExpr %>% t %>% Rtsne(perplexity=perplexities[i])
plot_data = data.frame('ID'=colnames(datExpr), 'C1'=tsne$Y[,1], 'C2'=tsne$Y[,2]) %>%
left_join(datMeta, by='ID') %>%
dplyr::select('C1','C2','Diagnosis') %>%
mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=Diagnosis)) + geom_point() + theme_minimal() +
ggtitle(paste0('Perplexity=',perplexities[i])) + theme(legend.position='none')
}
grid.arrange(grobs=ps, nrow=2)
rm(ps, perplexities, tsne, i)
The First Principal Component explains over 97% of the total variance
There’s a really strong correlation between the mean expression of a gene and the 1st principal component
The magnitude of the second principal component seems to be related to the level of expression of the genes (this didn’t happen in the other datasets)
pca = datExpr %>% prcomp
plot_data = data.frame( 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'MeanExpr'=rowMeans(datExpr))
plot_data %>% ggplot(aes(PC1, PC2, color=MeanExpr)) + geom_point(alpha=0.3) + theme_minimal() +
scale_color_viridis() + ggtitle('PCA') +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))
rm(pca, plot_data)
Higher perplexities capture a cleaner visualisation of the data ordered by mean expression, in a similar (although not as linear) way to PCA
perplexities = c(1,2,5,10,50,100)
ps = list()
for(i in 1:length(perplexities)){
tsne = read.csv(paste0('./../Visualisations/tsne_perplexity_',perplexities[i],'.csv'))
plot_data = data.frame('C1'=tsne[,1], 'C2'=tsne[,2], 'MeanExpr'=rowMeans(datExpr))
ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=MeanExpr)) + geom_point(alpha=0.3) + theme_minimal() +
scale_color_viridis() + ggtitle(paste0('Perplexity = ',perplexities[i])) + theme(legend.position='none')
}
grid.arrange(grobs=ps, nrow=2)
rm(perplexities, ps, tsne, i)
Only 2 genes (~0.013% vs ~28% in Gandal’s dataset) are significant using a threshold of 0.05 for the adjusted p-value and a without a log Fold Change threshold (keeping the null hypothesis \(H_0: LFC=0\))
table(DE_info$padj<0.05, useNA='ifany')
##
## FALSE TRUE
## 15390 2
p = DE_info %>% ggplot(aes(log2FoldChange, padj, color=significant)) + geom_point(alpha=0.2) +
scale_y_sqrt() + xlab('log2 Fold Change') + ylab('Adjusted p-value') + theme_minimal()
ggExtra::ggMarginal(p, type = 'density', color='gray', fill='gray', size=10)
rm(p)
There is a negative relation between LFC and mean expression
The relation is strongest for genes with low levels of expression
plot_data = data.frame('ID'=rownames(datExpr), 'meanExpr'=rowMeans(datExpr)) %>% left_join(DE_info, by='ID')
plot_data %>% ggplot(aes(meanExpr, abs(log2FoldChange))) +
geom_point(alpha = 0.3, color='#0099cc') + geom_smooth(method='lm', color = 'gray') +
theme_minimal() + scale_y_sqrt() + theme(legend.position = 'bottom') +
xlab('Mean Expression') + ylab('LFC Magnitude') +
ggtitle('Log fold change by level of expression')
List of DE genes
# Get genes names
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl', host='feb2014.archive.ensembl.org')
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=plot_data$ID, mart=mart) %>%
rename(external_gene_id = 'gene_name', ensembl_gene_id = 'ID')
top_genes = plot_data %>% left_join(gene_names, by='ID') %>% filter(padj<0.05) %>% arrange(-abs(log2FoldChange))
kable(top_genes %>% dplyr::select(ID, gene_name, log2FoldChange, padj, Neuronal))
| ID | gene_name | log2FoldChange | padj | Neuronal |
|---|---|---|---|---|
| ENSG00000133316 | WDR74 | -2.5677696 | 0.0000011 | 0 |
| ENSG00000101412 | E2F1 | -0.4326021 | 0.0143666 | 0 |
rm(top_genes)
No point in doing this having only 2 DE genes
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] knitr_1.28 biomaRt_2.40.5
## [3] DESeq2_1.24.0 SummarizedExperiment_1.14.1
## [5] DelayedArray_0.10.0 BiocParallel_1.18.1
## [7] matrixStats_0.56.0 Biobase_2.44.0
## [9] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0
## [11] IRanges_2.18.3 S4Vectors_0.22.1
## [13] BiocGenerics_0.30.0 ClusterR_1.2.1
## [15] gtools_3.8.2 expss_0.10.2
## [17] Rtsne_0.15 ggpubr_0.2.5
## [19] magrittr_1.5 GGally_1.5.0
## [21] gridExtra_2.3 viridis_0.5.1
## [23] viridisLite_0.3.0 RColorBrewer_1.1-2
## [25] plotlyutils_0.0.0.9000 plotly_4.9.2
## [27] glue_1.4.1 reshape2_1.4.4
## [29] forcats_0.5.0 stringr_1.4.0
## [31] dplyr_1.0.0 purrr_0.3.4
## [33] readr_1.3.1 tidyr_1.1.0
## [35] tibble_3.0.1 ggplot2_3.3.2
## [37] tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.8 Hmisc_4.4-0
## [4] plyr_1.8.6 lazyeval_0.2.2 splines_3.6.3
## [7] gmp_0.5-13.6 crosstalk_1.1.0.1 digest_0.6.25
## [10] htmltools_0.4.0 fansi_0.4.1 checkmate_2.0.0
## [13] memoise_1.1.0 cluster_2.1.0 annotate_1.62.0
## [16] modelr_0.1.6 prettyunits_1.1.1 jpeg_0.1-8.1
## [19] colorspace_1.4-1 blob_1.2.1 rvest_0.3.5
## [22] haven_2.2.0 xfun_0.12 crayon_1.3.4
## [25] RCurl_1.98-1.2 jsonlite_1.7.0 genefilter_1.66.0
## [28] survival_3.1-12 gtable_0.3.0 zlibbioc_1.30.0
## [31] XVector_0.24.0 scales_1.1.1 DBI_1.1.0
## [34] miniUI_0.1.1.1 Rcpp_1.0.4.6 xtable_1.8-4
## [37] progress_1.2.2 htmlTable_1.13.3 foreign_0.8-76
## [40] bit_1.1-15.2 Formula_1.2-3 htmlwidgets_1.5.1
## [43] httr_1.4.1 acepack_1.4.1 ellipsis_0.3.1
## [46] pkgconfig_2.0.3 reshape_0.8.8 XML_3.99-0.3
## [49] farver_2.0.3 nnet_7.3-14 dbplyr_1.4.2
## [52] locfit_1.5-9.4 later_1.0.0 tidyselect_1.1.0
## [55] labeling_0.3 rlang_0.4.6 AnnotationDbi_1.46.1
## [58] munsell_0.5.0 cellranger_1.1.0 tools_3.6.3
## [61] cli_2.0.2 generics_0.0.2 RSQLite_2.2.0
## [64] broom_0.5.5 fastmap_1.0.1 evaluate_0.14
## [67] yaml_2.2.1 bit64_0.9-7 fs_1.4.0
## [70] nlme_3.1-147 mime_0.9 ggExtra_0.9
## [73] xml2_1.2.5 compiler_3.6.3 rstudioapi_0.11
## [76] curl_4.3 png_0.1-7 ggsignif_0.6.0
## [79] reprex_0.3.0 geneplotter_1.62.0 stringi_1.4.6
## [82] highr_0.8 lattice_0.20-41 Matrix_1.2-18
## [85] vctrs_0.3.1 pillar_1.4.4 lifecycle_0.2.0
## [88] data.table_1.12.8 bitops_1.0-6 httpuv_1.5.2
## [91] R6_2.4.1 latticeExtra_0.6-29 promises_1.1.0
## [94] assertthat_0.2.1 withr_2.2.0 GenomeInfoDbData_1.2.1
## [97] mgcv_1.8-31 hms_0.5.3 grid_3.6.3
## [100] rpart_4.1-15 rmarkdown_2.1 shiny_1.4.0.2
## [103] lubridate_1.7.4 base64enc_0.1-3